library(car)
library(mosaic)
library(DT)
library(tidyverse)

Read in the Data

Made by Lance, Logan, and Daniel. Let’s take a look.

files <- dir()
rdat <- read.csv(grep("RBdata.csv", files, value=TRUE), header=TRUE)
as.tibble(rdat)
# A tibble: 100 x 11
        Y    X1    X2    X3    X4    X5    X6    X7    X8    X9   X10
    <dbl> <dbl> <int> <dbl> <int> <int> <dbl> <dbl> <int> <dbl> <dbl>
 1   4.55  35.4     0 3.93      0     0 7.56  2.29      0  27.2  3.29
 2   1.52  89.3     0 0.244     0     1 3.70  2.51      0  28.7 10.5 
 3  -3.43  40.8     1 3.89      0     0 9.07  1.81      0  44.8  3.54
 4  11.2   87.5     0 1.07      1     0 9.99  2.24      0  44.3 32.1 
 5  -1.29  15.7     1 2.99      0     0 5.94  2.58      1  48.7 13.0 
 6   2.88  16.6     1 4.63      0     0 1.23  1.20      0  85.4 30.6 
 7  -6.58  43.0     0 0.940     0     1 3.55  0.886     1  32.9 32.5 
 8  26.4   12.2     1 4.46      1     0 0.761 1.05      1  94.7 17.6 
 9 -22.2  129.      1 1.07      0     2 2.70  0.277     1  81.3 13.2 
10 -16.2   60.9     1 3.73      0     2 7.97  1.23      1  32.2 28.0 
# ... with 90 more rows

Create first Pairs Plot

pairs(rdat, pch=16, cex=1, panel=panel.smooth, col.smooth="skyblue4", col=rgb(.2,.2,.2,.8))

Start with X4

lm1 <- lm(Y ~ X4, data=rdat)
summary(lm1)
## 
## Call:
## lm(formula = Y ~ X4, data = rdat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -35.137  -5.492   0.243   5.403  24.846 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -1.319      1.436  -0.919     0.36    
## X4            19.583      2.011   9.740 4.44e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.05 on 98 degrees of freedom
## Multiple R-squared:  0.4919, Adjusted R-squared:  0.4867 
## F-statistic: 94.86 on 1 and 98 DF,  p-value: 4.445e-16
palette(c("skyblue","orange"))
pairs(cbind(R=lm1$res, fit=lm1$fit, rdat), pch=16, cex=1, panel=panel.smooth, col.smooth="skyblue4", col=factor(rdat$X4))

Add X7

lm2 <- lm(Y ~ X4 + X7, data=rdat)
summary(lm2)
## 
## Call:
## lm(formula = Y ~ X4 + X7, data = rdat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -29.146  -4.714  -0.500   5.928  20.371 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    4.946      2.181   2.268 0.025562 *  
## X4            19.536      1.894  10.313  < 2e-16 ***
## X7            -4.153      1.134  -3.662 0.000407 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.469 on 97 degrees of freedom
## Multiple R-squared:  0.5536, Adjusted R-squared:  0.5444 
## F-statistic: 60.15 on 2 and 97 DF,  p-value: < 2.2e-16
palette(c("skyblue","orange"))
pairs(cbind(R=lm2$res, fit=lm2$fit, rdat), pch=16, cex=1, panel=panel.smooth, col.smooth="skyblue4", col=factor(rdat$X4))

Try X4:X7?

lm3 <- lm(Y ~ X4 + X7 + X4:X7, data=rdat)
summary(lm3)
## 
## Call:
## lm(formula = Y ~ X4 + X7 + X4:X7, data = rdat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -23.8575  -5.4744  -0.2018   4.6318  25.1179 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -0.2725     2.6121  -0.104  0.91714    
## X4           30.2422     3.7158   8.139 1.44e-12 ***
## X7           -0.6939     1.5059  -0.461  0.64600    
## X4:X7        -7.1242     2.1613  -3.296  0.00137 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.022 on 96 degrees of freedom
## Multiple R-squared:  0.599,  Adjusted R-squared:  0.5865 
## F-statistic:  47.8 on 3 and 96 DF,  p-value: < 2.2e-16
palette(c("skyblue","orange"))
pairs(cbind(R=lm3$res, fit=lm3$fit, rdat), pch=16, cex=1, panel=panel.smooth, col.smooth="skyblue4", col=factor(rdat$X4))

Drop X7 base term

lm4 <- lm(Y ~ X4 + X4:X7, data=rdat)
summary(lm4)
## 
## Call:
## lm(formula = Y ~ X4 + X4:X7, data = rdat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -23.8575  -5.6645  -0.0355   4.6318  25.1179 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -1.319      1.284  -1.028    0.307    
## X4            31.289      2.928  10.685  < 2e-16 ***
## X4:X7         -7.818      1.544  -5.063 1.96e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.985 on 97 degrees of freedom
## Multiple R-squared:  0.5981, Adjusted R-squared:  0.5898 
## F-statistic: 72.18 on 2 and 97 DF,  p-value: < 2.2e-16
palette(c("skyblue","orange"))
pairs(cbind(R=lm4$res, fit=lm4$fit, rdat), pch=16, cex=1, panel=panel.smooth, col.smooth="skyblue4", col=factor(rdat$X8))

Add X8 and X7:X8

lm5 <- lm(Y ~ X4 + X8 + X4:X7 + X7:X8, data=rdat)
summary(lm5)
## 
## Call:
## lm(formula = Y ~ X4 + X8 + X4:X7 + X7:X8, data = rdat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -18.579  -4.968   0.867   4.505  21.326 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -2.592      1.176  -2.203     0.03 *  
## X4            37.918      2.741  13.833  < 2e-16 ***
## X8           -14.378      3.388  -4.243 5.12e-05 ***
## X4:X7        -11.905      1.476  -8.067 2.18e-12 ***
## X8:X7         10.929      1.828   5.978 3.94e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.625 on 95 degrees of freedom
## Multiple R-squared:  0.7165, Adjusted R-squared:  0.7046 
## F-statistic: 60.03 on 4 and 95 DF,  p-value: < 2.2e-16
palette(c("skyblue","orange","firebrick","green"))
pairs(cbind(R=lm5$res, fit=lm5$fit, rdat), pch=16, cex=1, panel=panel.smooth, col.smooth="skyblue4", col=interaction(factor(rdat$X4),factor(rdat$X8)))

Let’s take a closer look

There seems to be two more lines within each color…

palette(c("skyblue","orange","firebrick","green"))
par(mfrow=c(2,1))
plot(Y ~ X7, data=rdat, col=interaction(X4,X8), pch=16, cex=2)
plot(lm5, which=1, col=interaction(rdat$X4,rdat$X8), pch=16, cex=2)

Try an idea…

lm6 <- lm(Y ~ X7*X4*X8*X2, data=rdat)
summary(lm6)
## 
## Call:
## lm(formula = Y ~ X7 * X4 * X8 * X2, data = rdat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.1123 -1.6950 -0.3323  1.5209  8.7835 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -0.3565     1.3817  -0.258 0.797044    
## X7            1.1105     0.8316   1.335 0.185349    
## X4           26.0763     2.0539  12.696  < 2e-16 ***
## X8          -14.8849     2.9224  -5.093 2.13e-06 ***
## X2           10.4574     1.9297   5.419 5.62e-07 ***
## X7:X4        -7.9249     1.2300  -6.443 6.98e-09 ***
## X7:X8         9.1779     1.6122   5.693 1.79e-07 ***
## X4:X8         7.3003     4.0129   1.819 0.072443 .  
## X7:X2        -9.2267     1.1673  -7.904 9.43e-12 ***
## X4:X2         9.6268     2.9044   3.315 0.001356 ** 
## X8:X2       -20.6577     4.2790  -4.828 6.12e-06 ***
## X7:X4:X8     -4.6715     2.3834  -1.960 0.053306 .  
## X7:X4:X2     -4.9587     1.7343  -2.859 0.005354 ** 
## X7:X8:X2      7.7992     2.2884   3.408 0.001006 ** 
## X4:X8:X2      7.0978     5.6363   1.259 0.211412    
## X7:X4:X8:X2  11.0406     3.1925   3.458 0.000856 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.956 on 84 degrees of freedom
## Multiple R-squared:  0.9623, Adjusted R-squared:  0.9556 
## F-statistic:   143 on 15 and 84 DF,  p-value: < 2.2e-16

Reduce it to…

lm7 <- lm(Y ~ X4 + X8 + X2 + X7:X4 + X7:X8 + X7:X2 + X4:X2 + X8:X2 + X4:X8 + X4:X8:X7 + X7:X4:X2 + X7:X4:X8:X2, data=rdat)
summary(lm7)
## 
## Call:
## lm(formula = Y ~ X4 + X8 + X2 + X7:X4 + X7:X8 + X7:X2 + X4:X2 + 
##     X8:X2 + X4:X8 + X4:X8:X7 + X7:X4:X2 + X7:X4:X8:X2, data = rdat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.2350 -2.1023 -0.1049  1.7914 10.1187 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.0174     0.7703   1.321 0.190051    
## X4           25.4852     1.6350  15.588  < 2e-16 ***
## X8          -20.4474     2.3233  -8.801 1.15e-13 ***
## X2            7.7289     1.5448   5.003 2.91e-06 ***
## X4:X7        -7.2293     0.9078  -7.964 5.87e-12 ***
## X8:X7        13.1312     1.1141  11.786  < 2e-16 ***
## X2:X7        -7.0674     0.7998  -8.837 9.70e-14 ***
## X4:X2        10.7584     2.4050   4.473 2.32e-05 ***
## X8:X2        -8.9983     1.8203  -4.943 3.70e-06 ***
## X4:X8        10.2995     2.8913   3.562 0.000599 ***
## X4:X8:X7     -7.2172     1.9058  -3.787 0.000280 ***
## X4:X2:X7     -6.2882     1.4672  -4.286 4.69e-05 ***
## X4:X8:X2:X7  16.4394     1.5154  10.849  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.131 on 87 degrees of freedom
## Multiple R-squared:  0.9562, Adjusted R-squared:  0.9502 
## F-statistic: 158.4 on 12 and 87 DF,  p-value: < 2.2e-16

Draw it…

I loved your data. Great challenge. Satisfying result!

palette(c("skyblue","skyblue4","orange","orangered", "firebrick", "firebrick4", "green", "limegreen"))
plot(Y ~ X7, data=rdat, pch=16, col=interaction(factor(rdat$X2),factor(rdat$X4),factor(rdat$X8)))
b <- coef(lm7)
abline(b[1], 0, col=palette()[1])         #X4=0, X8=0, X2=0
abline(b[1]+b[2], b[5], col=palette()[3]) #X4=1, X8=0, X2=0
abline(b[1]+b[3], b[6], col=palette()[5]) #X4=0, X8=1, X2=0
abline(b[1]+b[4], b[7], col=palette()[2]) #X4=0, X8=0, X2=1
abline(b[1]+b[2]+b[3]+b[10], b[5]+b[6]+b[11], col=palette()[7]) #X4=1, X8=1, X2=0
abline(b[1]+b[2]+b[4]+b[8], b[5]+b[7]+b[12], col=palette()[4]) #X4=1, X8=0, X2=1
abline(b[1]+b[3]+b[4]+b[9], b[6]+b[7], col=palette()[6]) #X4=0, X8=1, X2=1
abline(b[1]+b[2]+b[3]+b[4]+b[8]+b[9]+b[10], b[5]+b[6]+b[7]+b[11]+b[12]+b[13], col=palette()[8]) #X4=1, X8=1, X2=1